# Always print this out before your assignment
sessionInfo()
getwd()
library('tidyverse')
-- Attaching packages ---------------------------------------------------------------------------------------------------------------------- tidyverse 1.3.1 --
v ggplot2 3.3.5 v purrr 0.3.4
v tibble 3.1.5 v stringr 1.4.0
v tidyr 1.1.4 v forcats 0.5.1
v readr 2.0.2
-- Conflicts ------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
x readr::col_factor() masks scales::col_factor()
x purrr::discard() masks scales::discard()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
library("fs")
library('here')
here() starts at C:/Users/cabrooke/Documents/R/696/group project/final project/final_project
library('dplyr')
library('tidyverse')
library('ggplot2')
library('ggrepel')
library('ggthemes')
library('forcats')
library('rsample')
library('lubridate')
Attaching package: ‘lubridate’
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
library('ggthemes')
library('kableExtra')
Attaching package: ‘kableExtra’
The following object is masked from ‘package:dplyr’:
group_rows
library('pastecs')
Attaching package: ‘pastecs’
The following object is masked from ‘package:tidyr’:
extract
The following objects are masked from ‘package:dplyr’:
first, last
library('viridis')
Loading required package: viridisLite
Attaching package: ‘viridis’
The following object is masked from ‘package:scales’:
viridis_pal
library('plotly')
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
library('tidyquant')
Loading required package: PerformanceAnalytics
Loading required package: xts
Loading required package: zoo
Attaching package: ‘zoo’
The following objects are masked from ‘package:base’:
as.Date, as.Date.numeric
Attaching package: ‘xts’
The following objects are masked from ‘package:pastecs’:
first, last
The following objects are masked from ‘package:dplyr’:
first, last
Attaching package: ‘PerformanceAnalytics’
The following object is masked from ‘package:graphics’:
legend
Loading required package: quantmod
Loading required package: TTR
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
== Need to Learn tidyquant? ===================================================================================================================================
Business Science offers a 1-hour course - Learning Lab #9: Performance Analysis & Portfolio Optimization with tidyquant!
</> Learn more at: https://university.business-science.io/p/learning-labs-pro </>
library('scales')
Final Project Cleaning and Summary Statistics
1a) Loading data
#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space
#na.strings is a substitution so all that have "" will = na
data <- read.csv(here::here("final_project", "donor_data.csv"),
stringsAsFactors = FALSE,
strip.white = TRUE,
na.strings = "")
1b) Fixing the wonky DOB & Data cleanup
glimpse(data_cleaned$zipslry_range)
logi [1:323000] NA NA NA NA NA NA ...
1c Creating factor variable for sex and married
data_cleaned <-
data_cleaned %>%
mutate(sex_fct =
fct_explicit_na(Sex)
)
data_cleaned <-
data_cleaned %>%
mutate(
sex_simple =
fct_lump_n(Sex, n = 4)
)
#checking to see if its a factor
class(data_cleaned$sex_fct)
#checking levels
levels(data_cleaned$sex_simple)
#creating a table against Sex column
table(data_cleaned$sex_fct, data_cleaned$sex_simple)
#making married a factor
data_cleaned_columns <-
data_cleaned_columns %>%
mutate(married_fct =
fct_explicit_na(Married)
)
#checking to see if its a factor
class(data_cleaned$married_fct)
1d #Mean, Median, and Count of Giving in Age Ranges
age_range_giving <- datacleaning %>%
group_by(age_range) %>%
summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
amount_of_people_in_age_range = n())
Question 2
DonorSegment Analysis
#grouping by donorsegment and analyzing
data_cleaned_columns %>%
group_by(Donor.Segment) %>%
summarise(Count = length(Donor.Segment),
mean_total_giv = mean(HH.Lifetime.Giving)) %>%
arrange(-Count) %>%
filter(Count >= 100) %>%
#added scales package to have the values show in dollar
mutate(mean_total_giv = dollar(mean_total_giv)) %>%
kable(col.names = c("Donor Segment", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| Donor Segment |
Count |
Mean HH Lifetime Giving |
| NA |
232033 |
$0 |
| Lost Donor |
69733 |
$4,958 |
| Lapsed Donor |
11220 |
$11,195 |
| Current Donor |
5704 |
$104,142 |
| Lapsing Donor |
3879 |
$16,595 |
| At-Risk Donor |
657 |
$85,198 |
NA
NA
2a) Plotting average giving by age range
ggplot(age_range_giving, aes(avg_giving, age_range)) +
geom_bar(stat = "identity")

NA
NA
2b) Count of donors based on age range (another way to look at it)
ggplot(datacleaning,
aes(age_range)) +
geom_bar() +
theme(axis.text.x = element_text(angle=45,
hjust=1)) +
labs(title = "Count of Age Ranges", x = "", y = "")

NA
NA
2c) Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset.
ggplot(datacleaning, aes(age_range,HH.Lifetime.Giving,fill = age_range)) +
geom_boxplot(
outlier.colour = "red") +
scale_y_log10() +
theme(axis.text.x=element_text(angle=45,hjust=1))
Warning: Transformation introduced infinite values in continuous y-axis
Warning: Removed 232033 rows containing non-finite values (stat_boxplot).

NA
NA
2d) Splitting by age and gender
#creating boxplots
datacleaning %>%
filter(Age < 100) %>% #removing the weird outliers that are over 100
filter(Sex %in% c("M", "F")) %>%
ggplot(aes(Sex, Age)) +
geom_boxplot() +
theme_economist() +
ggtitle("Ages of Donors Based on Gender") +
xlab(NULL) + ylab(NULL)

NA
NA
NA
NA
NA
2e) Distribution of people in the states that they live.
datacleaning %>%
mutate(State = ifelse(State == " ", "NA", State)) %>%
filter(State != "NA") %>%
group_by(State) %>%
summarise(Count = length(State)) %>%
filter(Count > 800) %>%
arrange(-Count) %>%
kable(col.names = c("Donor's State", "Count")) %>%
kable_styling(bootstrap_options = c("condensed"),
full_width = F)
| Donor's State |
Count |
| CA |
176695 |
| WA |
7958 |
| TX |
7268 |
| NY |
5661 |
| CO |
5073 |
| AZ |
4929 |
| OR |
4613 |
| FL |
4111 |
| IL |
3681 |
| HI |
3394 |
| PA |
2904 |
| OH |
2754 |
| NV |
2715 |
| MI |
2524 |
| MA |
2473 |
| NJ |
2311 |
| VA |
2158 |
| NC |
2087 |
| GA |
2045 |
| MO |
1889 |
| MN |
1732 |
| MD |
1488 |
| TN |
1443 |
| IN |
1417 |
| CT |
1380 |
| WI |
1330 |
| UT |
1174 |
| OK |
1151 |
| AL |
1120 |
| LA |
1110 |
| ID |
1096 |
| SC |
1076 |
| KY |
1032 |
| KS |
1027 |
| NM |
982 |
| IA |
880 |
NA
NA
NA
NA
NA
NA
2f) Looking at all donors first gift amount. 75% made a first gift of <100.
no_non_donors <- datacleaning %>%
filter(Lifetime.Giving != 0)
nd <- quantile(no_non_donors$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)
nd <- as.data.frame(nd)
nd %>%
kable(col.names = "Quantile") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
full_width = F)
| |
Quantile |
| 25% |
3.8 |
| 50% |
25.0 |
| 75% |
100.0 |
| 90% |
500.0 |
| 99% |
15000.0 |
NA
NA
NA
NA
Modeling for you
3a) Linear model
#converting married Y and N to 1 and 0
datacleaning <- datacleaning %>%
mutate(Married_simple = ifelse(Married == "N",0,1))
mod1lm <- lm( Married_simple ~ Lifetime.Giving,
data = datacleaning)
summary(mod1lm)
Call:
lm(formula = Married_simple ~ Lifetime.Giving, data = datacleaning)
Residuals:
Min 1Q Median 3Q Max
-0.4107 -0.2872 -0.2872 0.7128 0.7128
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.287196453375 0.000796143577 360.73 <0.0000000000000002 ***
Lifetime.Giving 0.000000006818 0.000000007174 0.95 0.342
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.4525 on 323224 degrees of freedom
Multiple R-squared: 2.795e-06, Adjusted R-squared: -2.991e-07
F-statistic: 0.9033 on 1 and 323224 DF, p-value: 0.3419
Kmeans
is.numeric(data_cleaned$HH.Lifetime.Giving)
[1] TRUE
3a)
p <- datacleaning %>%
ggplot(aes(Age)) + geom_histogram(bins=30, fill = "blue") + theme_economist_white() +
ggtitle("Overall Donor Age Distribution") +
xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(5,100,by = 20)) +
scale_y_continuous(breaks = seq(20,100,by = 20)) + xlim(c(20,100))
Scale for 'x' is already present. Adding another scale for 'x', which will replace the existing scale.
ggplotly(p)
Warning: Removed 199288 rows containing non-finite values (stat_bin).
p
Warning: Removed 199288 rows containing non-finite values (stat_bin).
Warning: Removed 2 rows containing missing values (geom_bar).

ggplot(data = datacleaning, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 199288 rows containing non-finite values (stat_bin).
Warning: Removed 2 rows containing missing values (geom_bar).

NA
NA
NA
---
title: "BROCODE Summary Statistics"
author: "Aaron, Cannon, Josh, Ryan"
subtitle: Final Project Summary Statistics
output:
  html_document:
    df_print: paged
  html_notebook: default
---

```{r setup, include=FALSE}

# Please leave this code chunk as is. It makes some slight formatting changes to alter the output to be more aesthetically pleasing. 

library(knitr)


# Change the number in set seed to your own favorite number
set.seed(1818)
options(width=70)
options(scipen=99)


# this sets text outputted in code chunks to small
opts_chunk$set(tidy.opts=list(width.wrap=50),tidy=TRUE, size = "vsmall")  
opts_chunk$set(message = FALSE,                                          
               warning = FALSE,
               # "caching" stores objects in code chunks and only rewrites if you change things
               cache = TRUE,                               
               # automatically downloads dependency files
               autodep = TRUE,
               # 
               cache.comments = FALSE,
               # 
               collapse = TRUE,
               # change fig.width and fig.height to change the code height and width by default
               fig.width = 5.5,  
               fig.height = 4.5,
               fig.align='center')


```

```{r setup-2}

# Always print this out before your assignment
sessionInfo()
getwd()

```


<!-- ### start answering your problem set here -->
<!-- You may export your homework in either html or pdf, with the former usually being easier. 
     To export or compile your Rmd file: click above on 'Knit' then 'Knit to HTML' -->
<!-- Be sure to submit both your .Rmd file and the compiled .html or .pdf file for full credit -->


```{r setup-3}

# load all your libraries in this chunk 
library('tidyverse')
library("fs")
library('here')
library('dplyr')
library('tidyverse')
library('ggplot2')
library('ggrepel')
library('ggthemes')
library('forcats')
library('rsample')
library('lubridate')
library('ggthemes')
library('kableExtra')
library('pastecs')
library('viridis')
library('plotly')
library('tidyquant')
library('scales')


# note, do not run install.packages() inside a code chunk. install them in the console outside of a code chunk. 

```



## Final Project Cleaning and Summary Statistics 

1a) Loading data

```{r}

#Reading the data in and doing minor initial cleaning in the function call
#Reproducible data analysis should avoid all automatic string to factor conversions.
#strip.white removes white space 
#na.strings is a substitution so all that have "" will = na
data <- read.csv(here::here("final_project", "donor_data.csv"),
                 stringsAsFactors = FALSE,
                 strip.white = TRUE,
                 na.strings = "")

```


1b) Fixing the wonky DOB & Data cleanup

```{r}

#(Birthdate and Age, ID as a number)adding DOB (Age/Spouse Age) in years columns and adding two fields for assignment and number of children
datacleaning <- data %>%
  mutate(Birthdate = ifelse(Birthdate == "0001-01-01", NA, Birthdate)) %>%
  mutate(Birthdate = mdy(Birthdate)) %>%
  mutate(Age = as.numeric(floor(interval(start= Birthdate, end=Sys.Date())/duration(n=1, unit="years")))) %>%
  mutate(Spouse.Birthdate = ifelse(Spouse.Birthdate == "0001-01-01", NA, Spouse.Birthdate)) %>%
  mutate(Spouse.Birthdate = mdy(Spouse.Birthdate)) %>%
  mutate(Spouse.Age = as.numeric(floor(interval(start= Spouse.Birthdate,
                                                end=Sys.Date())/duration(n=1, unit="years")))) %>%
  mutate(ID = as.numeric(ID)) %>% 
  mutate(Assignment_flag = ifelse(is.na(Assignment.Number), 0,1)) %>% 
  mutate( No_of_Children = ifelse(is.na(Child.1.ID),0,
                            ifelse(is.na(Child.2.ID),1,2)))

#splitting up the age into ranges and creating category for easy visualization 
datacleaning <- datacleaning %>%
  mutate(age_range = 
    ifelse(Age %in% 10:19, "10 < 20 year olds",
    ifelse(Age %in% 20:29, "20 < 30 year olds", 
    ifelse(Age %in% 30:39, "30 < 40 year olds",
    ifelse(Age %in% 40:49, "40 < 50 year olds",
    ifelse(Age %in% 50:59, "50 < 60 year olds",
    ifelse(Age %in% 60:69, "60 < 70 year olds",
    ifelse(Age %in% 70:79, "70 < 80 year olds",
    ifelse(Age %in% 80:89, "80 < 90 year olds",
    ifelse(Age %in% 90:99, "90 < 100 year olds",
    ifelse(Age %in% 100:109, "100 < 110 year olds",
    ifelse(Age %in% 110:120, "110 - 120  year olds",
    NA))))))))))))

#splitting zipcode salary into ranges for easy visualization 
data_cleaned <- data_cleaned %>%
  mutate(zipslry_range = 
    ifelse(zipcode_slry %in% 90000:99000, "90K-99K",
    ifelse(zipcode_slry %in% 100000:149000, "100K-149K", 
    ifelse(zipcode_slry %in% 150000:199000, "150K-199K",
    ifelse(zipcode_slry %in% 200000:249000, "200K-249K",
    ifelse(zipcode_slry %in% 250000:299000, "250K-299K",
    ifelse(zipcode_slry %in% 300000:349000, "300K-349K",
    ifelse(zipcode_slry %in% 350000:399000, "350K-399K",
    ifelse(zipcode_slry %in% 400000:499000, "400K-499K",
    ifelse(zipcode_slry %in% 500000:999000, "500K-999K",
    NA))))))))))

glimpse(data_cleaned$zipslry_range)


#seeing what we have
table(datacleaning$age_range)
#50-60 is the most common age range 

#Removing Columns that provide no benefit 

data_cleaned_columns <- subset(datacleaning,select = -c(Assignment.Number
                                                        ,Assignment.has.Historical.Mngr
                                                        ,Suffix
                                                        ,Assignment.Date
                                                        ,Assignment.Manager
                                                        ,Assignment.Role
                                                        ,Assignment.Title
                                                        ,Assignment.Status
                                                        ,Strategy
                                                        ,Progress.Level
                                                        ,Assignment.Group
                                                        ,Assignment.Category
                                                        ,Funding.Method
                                                        ,Expected.Book.Date
                                                        ,Qualification.Amount
                                                        ,Expected.Book.Amount
                                                        ,Expected.Book.Date
                                                        ,Hard.Gift.Total
                                                        ,Soft.Credit.Total
                                                        ,Total.Assignment.Gifts
                                                        ,No.of.Pledges
                                                        ,Proposal..
                                                        ,Proposal.Notes
                                                        ,HH.Life.Hard.Credit
                                                        ,HH.Life.Soft.Credit
                                                        ,HH.Life.Spouse.Credit
                                                        ,Last.Contact.By.Manager
                                                        ,X..of.Contacts.By.Manager))
#cleaning up zip codes removing -4 after 
data_cleaned_columns$Zip <- gsub(data_cleaned_columns$Zip, pattern="-.*", replacement = "")

#adding zip code data and column 
zip <- read.csv(here::here("final_project", "Salary_Zipcode.csv"),
                 stringsAsFactors = FALSE,
                 strip.white = TRUE,
                 na.strings = "")

#adding zip salary column
data_cleaned_columns <-data_cleaned_columns %>%
    mutate(zipcode_slry = VLOOKUP(Zip, zip, NAME, S1902_C03_002E))

#adding scholarship data (y/n)
schlr <- read.csv(here::here("final_project", "scholarship.csv"),
                 stringsAsFactors = FALSE,
                 strip.white = TRUE,
                 na.strings = "")

#adding scholarship column
data_cleaned_columns <-data_cleaned_columns %>%
    mutate(scholarship = VLOOKUP(ID, schlr, ID, SCHOLARSHIP)) 

#replacing NA with 0 
 data_cleaned_columns$scholarship <- replace_na(data_cleaned_columns$scholarship,'0')
 
#replacing Y with 1 
data_cleaned_columns$scholarship<-ifelse(data_cleaned_columns$scholarship=="Y",1,0)

#checking how many are N
table(data_cleaned_columns$scholarship)


#checking and deleting scholarship column 
class(data_cleaned_columns$schlr_fct)
data_cleaned_columns = subset(data_cleaned_columns, select = -c(scholarship))
  
#checking for duplicates N >1 indicates a records values are in the file twice 
data_cleaned_columns %>% group_by(ID) %>% count() %>% arrange(desc(n))

#removing duplicated records

data_cleaned <- unique(data_cleaned_columns)

#n = 1 no ID with multiple records cleaned of dupes
data_cleaned %>% group_by(ID) %>% count() %>% arrange(desc(n))

```

1c Creating factor variable for sex and married 

```{r}

data_cleaned <- 
  data_cleaned %>% 
  mutate(sex_fct = 
           fct_explicit_na(Sex)
  )


data_cleaned <-
data_cleaned %>% 
mutate(
  sex_simple = 
    fct_lump_n(Sex, n = 4)
)

#checking to see if its a factor
class(data_cleaned$sex_fct)

#checking levels
levels(data_cleaned$sex_simple)

#creating a table against Sex column 
table(data_cleaned$sex_fct, data_cleaned$sex_simple)

#making married a factor 
data_cleaned_columns <- 
  data_cleaned_columns %>% 
  mutate(married_fct = 
           fct_explicit_na(Married)
  )

#checking to see if its a factor
class(data_cleaned$married_fct)


```




```{r}



```

1d #Mean, Median, and Count of Giving in Age Ranges 

```{r}

age_range_giving <- datacleaning %>%
  group_by(age_range) %>%
  summarise(avg_giving = mean(HH.Lifetime.Giving, na.rm = TRUE),
            med_giving = median(HH.Lifetime.Giving, na.rm = TRUE),
            amount_of_people_in_age_range = n())



```





## Question 2

DonorSegment Analysis

```{r}
#grouping by donorsegment and analyzing 
data_cleaned_columns %>%
  group_by(Donor.Segment) %>%
  summarise(Count = length(Donor.Segment),
            mean_total_giv = mean(HH.Lifetime.Giving)) %>%
  arrange(-Count) %>%
  filter(Count >= 100) %>%
  #added scales package to have the values show in dollar 
  mutate(mean_total_giv = dollar(mean_total_giv)) %>%
  kable(col.names = c("Donor Segment", "Count", "Mean HH Lifetime Giving"), align=rep('c', 3)) %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = F)
  

```

2a) Plotting average giving by age range 


```{r}

ggplot(age_range_giving, aes(avg_giving, age_range)) +
  geom_bar(stat = "identity")


```


2b) Count of donors based on age range (another way to look at it)


```{r}

ggplot(datacleaning, 
       aes(age_range)) + 
       geom_bar() + 
       theme(axis.text.x = element_text(angle=45,
                                        hjust=1)) + 
  labs(title = "Count of Age Ranges", x = "", y = "")
  

```

2c) Boxplot of the Age Ranges Against the Lifetime Giving Amounts with a log scale applied - the reason we applied log scale is to resolve issues with visualizations that skew towards large values in our dataset. 


```{r}

ggplot(datacleaning, aes(age_range,HH.Lifetime.Giving,fill = age_range)) + 
  geom_boxplot(
  outlier.colour = "red") + 
  scale_y_log10() +
  theme(axis.text.x=element_text(angle=45,hjust=1))
  

```

2d) Splitting by age and gender 


```{r}


#creating boxplots 
datacleaning %>% 
  filter(Age < 100) %>% #removing the weird outliers that are over 100 
  filter(Sex %in% c("M", "F")) %>%
  ggplot(aes(Sex, Age)) + 
  geom_boxplot() + 
  theme_economist() + 
  ggtitle("Ages of Donors Based on Gender") + 
  xlab(NULL) + ylab(NULL)
  
  
  


```

2e) Distribution of people in the states that they live.

```{r}

  datacleaning %>%
  mutate(State = ifelse(State == " ", "NA", State)) %>%
  filter(State != "NA") %>%
  group_by(State) %>%
  summarise(Count = length(State)) %>%
  filter(Count > 800) %>%
  arrange(-Count) %>%
  kable(col.names = c("Donor's State", "Count")) %>%
  kable_styling(bootstrap_options = c("condensed"),
                full_width = F)
  
 
  
  


```

2f) Looking at all donors first gift amount. 75% made a first gift of <100. 

```{r}

 no_non_donors <- datacleaning %>%
  filter(Lifetime.Giving != 0)
  
nd <- quantile(no_non_donors$HH.First.Gift.Amount, probs = c(.25,.50,.75,.9,.99), na.rm = TRUE)

nd <- as.data.frame(nd)

nd %>%
  kable(col.names = "Quantile") %>%
  kable_styling(bootstrap_options = c("striped", "hover"),
                full_width = F)
  
  


```



## Modeling for you 


3a) Linear model 

```{r}
#converting married Y and N to 1 and 0 
datacleaning <- datacleaning %>%
      mutate(Married_simple = ifelse(Married == "N",0,1))
 

mod1lm <- lm( Married_simple ~ Lifetime.Giving,
           data = datacleaning)

summary(mod1lm)
  


```
Kmeans
```{r}

pred_vars <- c('married_fct', 'sex_fct') 
 
 
data_cleaned_K <- select(data_cleaned,
                     pred_vars,
                     HH.Lifetime.Giving)
 
#build cluster
dd_kmeans <- kmeans(x = data_cleaned_K, 
                    centers = 5, 
                    nstart = 10)

is.numeric(data_cleaned$HH.Lifetime.Giving)

```





3a) 

```{r}
p <- datacleaning %>%
  ggplot(aes(Age)) + geom_histogram(bins=30, fill = "blue") + theme_economist_white() +
  ggtitle("Overall Donor Age Distribution") + 
  xlab(NULL) + ylab(NULL) + scale_x_continuous(breaks = seq(5,100,by = 20)) +
  scale_y_continuous(breaks = seq(20,100,by = 20)) + xlim(c(20,100))

ggplotly(p)
  
p

ggplot(data = datacleaning, aes(x = Age)) + geom_histogram(fill ="blue")+ xlim(c(20,100))

  


```
